## Warning: replacing previous import 'BiocGenerics::var' by 'stats::var' when
## loading 'MLInterfaces'

TMT labeling efficiency

# Read data in
setwd("~/P239_Ivana_Celardo/analysis")
f <- "../rawdata/Experiment_3_effic_PeptideGroups.txt"
e <- grepEcols(f, "Abundance F", split = "\t")
#getEcols(f, split = "\t")
#e<-c(53:62)
xf <- readMSnSet2(f, e, sep = "\t")

# How many high quality peptides are modified
hqp <- sum(fData(xf)$Confidence == "High")
hqpm <- sum(fData(xf)$Confidence == "High" & grepl("TMT", fData(xf)$Modifications))
perc <- round(hqpm / hqp * 100, 0)

Incorporation of the tags was evaluated by the number of peptides with the respective modification in the N-terminal and/or lysine residues.

The labelling efficiency is: 99%.



Exploration data analysis

Raw data distribution

# Read proteins data in
f <- "../rawdata/Experiment_3_Proteins.txt" #make sure r-friendly headers
e <- grepEcols(f, "Abundance F", split = "\t")
x <- readMSnSet2(f, e, sep = "\t")

#getEcols(f, split = "\t")
#e<-c(53:62)
# save(x, file = "../rscript/P164_TMT_10_plex_result_Proteins.rda")

# Read saved data
#load("../rscript/P164_TMT_10_plex_result_Proteins.rda")

## Data annotation
# create pData
Sample <- gsub("Abundance\\.F1\\.\\w*\\.Sample\\.(\\d\\.+\\w*)", "\\1", colnames(exprs(x)))
tmp <- data.frame(do.call(rbind, strsplit(Sample, "[..]")))
tmp<-tmp[,c(3,1)]
names(tmp) <- c("Sample", "Replicate")
sampleNames(x)<-paste(tmp$Sample, tmp$Replicate, sep = "")
rownames(tmp) <- paste(tmp$Sample, tmp$Replicate, sep = "")
pData(x) <- tmp

## Number of high confidance proteins
cp <- data.frame(table(fData(x)$Protein.FDR.Confidence))
colnames(cp)[1] <- "Protein.FDR.Confidence"
tmp <- x[fData(x)$Protein.FDR.Confidence == "High",]

The rawdata file contains 5634 proteins. The protein FDR confidance for these proteins can be:

  • High [<0.01]

  • Medium [<0.05]

  • Low [>0.05]


From now on the analysis will be carry on with only the high confidance proteins.

The new dataset contains 4965 proteins.

Check for missing values

## Plot missing values
naplot(tmp)

x <- filterNA(tmp, 0)


All the missing values have been removed. The analysis will only includes the proteins with a value for each channel.

The dataset used for the analysis contains 4822 proteins.

Principal component analysis

The protein quantification values have been log2 transformed and a PC analysis has been performed to visualise the data correlation.

x <- log(x,2)
plot2D(t(x), fcol = "Sample", cex = 2, addLegend = "bottomleft")


The follow plot shows the distribution and correlation for every sample, with all the other samples.

Boxplot

boxplot(exprs(x), col = rep(c("yellow", "blue"), each = 5))

The boxplot shows that the CONTR distributions have very similar median, while the PERK samples show more variability.

Sample normalisation

Samples have been normalised using each channel median.

x <- normalise(x, "diff.median")

Boxplot of normalised samples

boxplot(exprs(x), col = rep(c("yellow", "blue"), each = 5))

### There is a negative point!!!!

 PCA of normalised samples

plot2D(t(x), fcol = "Sample", cex = 2, addLegend = "bottomleft")

After normalisation the PCA plot looks better, with a clear separation on the PC1 of the two samples.

Here you can see the normalised data. This dataset is the one use for the differential expression analysis.

Differential expression analysis (DEA)

To perform the DEA we used limma package.

library(limma)
library(plotly)

## Differential analysis
## Model matrices
mat <- model.matrix(~ 0 + pData(x)$Sample)
colnames(mat) <- c("daGAL_Ab_ARC", "daGAL4_PLUS")
pData(x)$cmat <- mat

## Functions
dostats <- function(x) {
    cont.matrix <- makeContrasts(diffexp  = daGAL_Ab_ARC - daGAL4_PLUS,
                                 levels   = pData(x)[, "cmat"])
    ## Statistical analysis
    fit <- lmFit(exprs(x), pData(x)[, "cmat"])
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    ## Result tables
    tt1 <- topTable(fit2, adjust.method = "BH", coef = "diffexp", number = Inf)
}

## Statistical analysis
res <- dostats(x)

## Prepare data to be saved
fData(x)$diffexp <- res[featureNames(x), ]
dfr <- cbind(data.frame(exprs(x)),
             descr = fData(x)$Description,
             logFC = fData(x)$diffexp[, "logFC"],
             adj.P.Val = fData(x)$diffexp[, "adj.P.Val"])

plot_ly(dfr, x = ~logFC, y = ~-log10(adj.P.Val), type = "scatter",
        text = ~paste("ID: ", rownames(dfr),
                      "\nDesc: ", descr,
                      "\nFC: ", round(logFC, 2),
                      "\n-log10 adj.Pval: ", round(-log10(adj.P.Val), 3),
                      "\nadjusted pvalue: ", adj.P.Val))